Drzewo danych
Drzewo danych
Drzewo danych
previous <- read.csv(file = "../previous_credits_data.csv")
current <- read.csv(file = "../data/parted/data_train.csv")
read.csv("../data/bureau_balance.csv", nrow = 5)
read.csv("../data/previous_application.csv", nrow = 5)
read.csv("../data/credit_card_balance.csv", nrow = 5)
installments %>% select(SK_ID_PREV, SK_ID_CURR, DAYS_INSTALMENT,
DAYS_ENTRY_PAYMENT, AMT_INSTALMENT, AMT_PAYMENT) %>%
mutate(DAYS_DELAY = DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT,
AMT_UNDERPAYMENT = AMT_INSTALMENT - AMT_PAYMENT) %>%
mutate(STATUS = if_else(is.na(DAYS_DELAY), "X",
if_else(DAYS_DELAY <= 0, "0",
if_else(DAYS_DELAY <= 30, "1",
if_else(DAYS_DELAY <= 60, "2",
if_else(DAYS_DELAY <= 90, "3",
if_else(DAYS_DELAY <= 120, "4", "5"))))))) %>%
mutate(STATUS_X = if_else(STATUS == "X", 1, 0),
STATUS_0 = if_else(STATUS == "0", 1, 0),
STATUS_1 = if_else(STATUS == "1", 1, 0),
STATUS_2 = if_else(STATUS == "2", 1, 0),
STATUS_3 = if_else(STATUS == "3", 1, 0),
STATUS_4 = if_else(STATUS == "4", 1, 0),
STATUS_5 = if_else(STATUS == "5", 1, 0)) %>%
select(SK_ID_PREV, SK_ID_CURR, STATUS_X, STATUS_0, STATUS_1, STATUS_2, STATUS_3, STATUS_4, STATUS_5, DAYS_INSTALMENT) %>%
group_by(SK_ID_CURR, SK_ID_PREV) %>%
summarise(STATUS_X = sum(STATUS_X), STATUS_0 = sum(STATUS_0), STATUS_1 = sum(STATUS_1), STATUS_2 = sum(STATUS_2),
STATUS_3 = sum(STATUS_3), STATUS_4 = sum(STATUS_4), STATUS_5 = sum(STATUS_5), IS_ACTIVE = max(DAYS_INSTALMENT) > -31) %>%
mutate(SUM_LATE_MONTHS = STATUS_1 + STATUS_2 * 2 + STATUS_3 * 3 + STATUS_4 * 4 + STATUS_5 * 5) %>%
mutate(SUM_MONTHS = STATUS_1 + STATUS_2 + STATUS_3 + STATUS_4 + STATUS_5 + STATUS_0 + STATUS_X) %>%
mutate(RATIO_LATE_MONTHS = SUM_LATE_MONTHS/SUM_MONTHS) -> inst_mod
JOINED_prev_inst <- left_join(previous_apps, inst_mod, by = c("SK_ID_PREV", "SK_ID_CURR"))
credit_cards %>% select(SK_ID_PREV, SK_ID_CURR, AMT_PAYMENT_TOTAL_CURRENT, AMT_INST_MIN_REGULARITY, MONTHS_BALANCE) %>%
mutate(IS_NOT_PAID = if_else(AMT_PAYMENT_TOTAL_CURRENT - AMT_INST_MIN_REGULARITY < 0, 1, 0)) %>%
group_by(SK_ID_PREV, SK_ID_CURR) %>%
summarise(SUM_LATE_MONTHS = sum(IS_NOT_PAID),
SUM_MONTHS = n(),
RATIO_LATE_MONTHS = SUM_LATE_MONTHS/SUM_MONTHS,
IS_ACTIVE = max(MONTHS_BALANCE) >= -1) -> cards_mod
left_join(JOINED_prev_inst, cards_mod, by = c("SK_ID_PREV", "SK_ID_CURR"))%>%
mutate(SUM_LATE_MONTHS =
if_else(is.na(SUM_LATE_MONTHS.y), if_else(is.na(SUM_LATE_MONTHS.x), 0, SUM_LATE_MONTHS.x),
if_else(is.na(SUM_LATE_MONTHS.x), SUM_LATE_MONTHS.y, SUM_LATE_MONTHS.x + SUM_LATE_MONTHS.y))) %>%
mutate(SUM_MONTHS =
if_else(is.na(SUM_MONTHS.y), if_else(is.na(SUM_MONTHS.x), 0, SUM_MONTHS.x),
as.double(if_else(is.na(SUM_MONTHS.x), SUM_MONTHS.y, as.integer(SUM_MONTHS.x + SUM_MONTHS.y))))) %>%
mutate(SUM_MONTHS = if_else(SUM_MONTHS < SUM_LATE_MONTHS, SUM_LATE_MONTHS, SUM_MONTHS)) %>%
mutate(RATIO_LATE_MONTHS = if_else(is.na(SUM_MONTHS),0,SUM_LATE_MONTHS/SUM_MONTHS)) %>%
mutate(STATUS_0 = SUM_MONTHS - SUM_LATE_MONTHS) %>%
mutate(STATUS_1 = if_else(!is.na(SUM_LATE_MONTHS.y) & !is.na(SUM_MONTHS.y),
if_else(is.na(STATUS_1), SUM_LATE_MONTHS.y, STATUS_1 + SUM_LATE_MONTHS.y),
if_else(is.na(STATUS_1), 0, STATUS_1))) %>%
mutate(STATUS_X = if_else(is.na(SUM_LATE_MONTHS.y) & !is.na(SUM_MONTHS.y),
if_else(is.na(STATUS_X), SUM_MONTHS.y, as.integer(STATUS_X + SUM_MONTHS.y)), as.integer(STATUS_X))) %>%
mutate(IS_ACTIVE = if_else(!is.na(IS_ACTIVE.y) & !is.na(IS_ACTIVE.x), IS_ACTIVE.x | IS_ACTIVE.y,
if_else(is.na(IS_ACTIVE.y), IS_ACTIVE.x, IS_ACTIVE.y))) %>%
mutate(UNKNOWN_STATE = if_else(is.na(IS_ACTIVE), 1, 0)) %>%
select(-SUM_MONTHS.x, -SUM_MONTHS.y, -SUM_LATE_MONTHS.x, -SUM_LATE_MONTHS.y, -RATIO_LATE_MONTHS.x, -RATIO_LATE_MONTHS.y,
-IS_ACTIVE.x, -IS_ACTIVE.y) %>%
select(SK_ID_CURR, SK_ID_PREV, NAME_CONTRACT_TYPE, STATUS_X,
STATUS_0, STATUS_1, STATUS_2, STATUS_3, STATUS_4, STATUS_5,
SUM_MONTHS, SUM_LATE_MONTHS, RATIO_LATE_MONTHS, IS_ACTIVE, UNKNOWN_STATE,
NAME_CONTRACT_TYPE) %>%
filter(!is.na(SUM_MONTHS)) %>%
mutate(STATUS_X = if_else(is.na(STATUS_X), SUM_MONTHS-STATUS_1-STATUS_0,as.double(STATUS_X)),
STATUS_2 = if_else(is.na(STATUS_2), 0, as.double(STATUS_2)),
STATUS_3 = if_else(is.na(STATUS_3), 0, as.double(STATUS_3)),
STATUS_4 = if_else(is.na(STATUS_4), 0, as.double(STATUS_4)),
STATUS_5 = if_else(is.na(STATUS_5), 0, as.double(STATUS_5))) %>%
mutate(CASH_LOANS = if_else(NAME_CONTRACT_TYPE == "Cash loans", 1, 0),
CONSUMER_LOANS = if_else(NAME_CONTRACT_TYPE == "Consumer loans", 1, 0),
REVOLVING_LOANS = if_else(NAME_CONTRACT_TYPE == "Revolving loans", 1, 0),
UNKNOWN_TYPE_LOANS = if_else(NAME_CONTRACT_TYPE == "XNA", 1, 0)) %>%
group_by(SK_ID_CURR) %>%
summarise(STATUS_0 = sum(STATUS_0), STATUS_X = sum(STATUS_X), STATUS_1 = sum(STATUS_1),
STATUS_2 = sum(STATUS_2), STATUS_3 = sum(STATUS_3), STATUS_4 = sum(STATUS_4), STATUS_5 = sum(STATUS_5),
SUM_MONTHS = sum(SUM_MONTHS), SUM_LATE_MONTHS = sum(SUM_LATE_MONTHS), RATIO_LATE_MONTHS = SUM_LATE_MONTHS/SUM_MONTHS,
CASH_LOANS = sum(CASH_LOANS), CONSUMER_LOANS = sum(CONSUMER_LOANS), REVOLVING_LOANS = sum(REVOLVING_LOANS),
UNKNOWN_TYPE_LOANS = sum(UNKNOWN_TYPE_LOANS), ACTIVE = sum(IS_ACTIVE, na.rm = TRUE), CLOSED = sum(!IS_ACTIVE, na.rm = TRUE), SOLD = 0, BAD_DEBT = 0, UNKNOWN_STATE = 0) %>%
bind_rows(read.csv("previous_appliactions/grouped_data_plus_active.csv"))%>%
group_by(SK_ID_CURR) %>%
summarise(STATUS_0 = sum(STATUS_0, na.rm = TRUE),
STATUS_1 = sum(STATUS_1, na.rm = TRUE),
STATUS_2 = sum(STATUS_2, na.rm = TRUE),
STATUS_3 = sum(STATUS_3, na.rm = TRUE),
STATUS_4 = sum(STATUS_4, na.rm = TRUE),
STATUS_5 = sum(STATUS_5, na.rm = TRUE),
STATUS_X = sum(STATUS_X, na.rm = TRUE),
SUM_MONTHS = sum(SUM_MONTHS, na.rm = TRUE),
SUM_LATE_MONTHS = sum(SUM_LATE_MONTHS, na.rm = TRUE),
RATIO_LATE_MONTHS = if_else(is.na(SUM_MONTHS), 0, SUM_LATE_MONTHS/SUM_MONTHS),
CASH_LOANS = sum(CASH_LOANS, na.rm = TRUE),
CONSUMER_LOANS = sum(CONSUMER_LOANS, na.rm = TRUE),
REVOLVING_LOANS = sum(REVOLVING_LOANS, na.rm = TRUE),
UNKNOWN_TYPE_LOANS = sum(UNKNOWN_TYPE_LOANS, na.rm = TRUE),
ACTIVE = sum(ACTIVE, na.rm = TRUE),
CLOSED = sum(CLOSED, na.rm = TRUE),
SOLD = sum(SOLD, na.rm = TRUE),
BAD_DEBT = sum(BAD_DEBT, na.rm = TRUE),
UNKNOWN_STATE = sum(UNKNOWN_STATE, na.rm = TRUE)) -> previous_credits_data
Odsetek klientĂłw, ktĂłrzy brali wczeĆniej jakÄ Ć poĆŒyczkÄ:
current <- left_join(current, previous, by = "SK_ID_CURR")
sum(!is.na(current$SUM_MONTHS))/nrow(current)
## [1] 0.9917347
GĆĂłwnÄ ramkÄ danych jest application_train. Przedstawia ona wszystkie aplikacje. PoniĆŒsze wykresy zostaĆy wykonane na treningowym podzbiorze.
Ramka zawiera kolumnÄ TARGET z wartoĆciami 0 i 1. 1 oznacza klientĂłw, ktĂłrzy mieli problemy z pĆatnoĆciami, a 0 pozostaĆych.
Ponad 90% osĂłb nie mam problemĂłw z pĆatnoĆciami.
BiorÄ c pod uwagÄ miasto
Nie biorÄ c pod uwagÄ miasta
MoĆŒna zauwaĆŒyÄ znaczÄ cÄ rĂłĆŒnicÄ miÄdzy osobami z rankingiem 1 i 3.
Osoby nie pracujÄ ce w mieĆcie, w ktĂłrym sÄ zameldowane czÄĆciej majÄ problemy ze spĆatÄ kredytu.
Osoby bÄdÄ ce pracownikami fizycznymi najczÄĆciej majÄ problemy z terminowymi pĆatnoĆciami. Inne zagroĆŒone grupy to kierowcy, ochroniarze i ludzie pracujÄ cy w gastronomii.
Osoby mieszkajÄ ce z rodzicami lub wynajmujÄ ce mieszkanie czÄĆciej majÄ problemy ze spĆatÄ zobowiÄ zaĆ.
Osoby biorÄ
ce kredyt gotĂłwkowy majÄ
problemy finansowe czÄĆciej, niĆŒ osoby biorÄ
ce kredyt odnawialny. ### Ile osĂłb z otoczenia spoĆecznego osoby aplikujÄ
cej spĂłĆșniĆo siÄ ze spĆatÄ
60 dni
Osoby majÄ ce w otoczeniu spoĆecznym osoby, ktĂłre zalegajÄ ze spĆatÄ , same czÄĆciej majÄ problemy finansowe
Klienci, ktĂłrzy dostarczyli dokument 3, czÄsciej mieli problemy ze spĆatÄ kredytu, tak samo jak klienci, ktĂłrzy nie dostarczyli dokumentu 6.
NajwiÄkszÄ korelacjÄ ze zmiennÄ TARGET majÄ zmienne EXT_SOURCE_2 i EXT_SOURCE_3. Na wykresach skrzynkowych moĆŒna zauwaĆŒyÄ, ĆŒe osoby majÄ ce problemy ze spĆatÄ kredytu majÄ niĆŒszÄ ocenÄ w obu tych ĆșrĂłdĆach.
current <- current %>% filter(!is.na(SUM_MONTHS)) %>% select(TARGET, STATUS_0, STATUS_1, STATUS_2, STATUS_3, STATUS_4, STATUS_5, STATUS_X, SUM_MONTHS, SUM_LATE_MONTHS, RATIO_LATE_MONTHS, ACTIVE, CLOSED, BAD_DEBT, SOLD, UNKNOWN_STATE, SUM_MONTHS, CONSUMER_LOANS, REVOLVING_LOANS, UNKNOWN_TYPE_LOANS)
current$TARGET <- as.factor(current$TARGET)
ggplot(data = current, aes(x = log10(STATUS_0), fill = TARGET)) +
geom_histogram(position = position_identity(), alpha = 0.8) +
xlab("STATUS_0 value (log)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = current, aes(x = log10(STATUS_1), fill = TARGET)) +
geom_histogram(position = position_identity(), alpha = 0.8) +
xlab("STATUS_1 value (log)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = current, aes(x = log10(STATUS_2 + STATUS_3 + STATUS_4 + STATUS_5), fill = TARGET)) +
geom_histogram(position = position_identity(), alpha = 0.8) +
xlab("STATUS_>1 value (log)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = current, aes(x = log10(SUM_MONTHS), fill = TARGET)) +
geom_histogram(position = position_identity(), alpha = 0.8) +
xlab("SUM_MONTHS value (log10)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = current, aes(x = log10(ACTIVE), fill = TARGET)) +
geom_histogram(position = position_identity(), alpha = 0.8) +
xlab("ACTIVE value (log10)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = current, aes(x = log10(ACTIVE), fill = TARGET)) +
geom_histogram(position = position_identity(), alpha = 0.8) +
xlab("CLOSED value (log10)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
current %>% group_by(TARGET) %>% summarise(STATUS_0 = mean(STATUS_0), STATUS_1 = mean(STATUS_1), STATUS_m1 = mean(STATUS_2 + STATUS_3 + STATUS_4 + STATUS_5), ACTIVE = mean(ACTIVE), CLOSED = mean(CLOSED))
current$TARGET <- as.numeric(levels(current$TARGET))[as.integer(current$TARGET)]
visdat::vis_cor(current)